from pyspark.sql import SparkSession

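# A broadcast variable stores data that a closure would otherwise capture in the memory of each Executor.
# Broadcast variables in Spark cannot be modified (they are known as distributed, read-only shared variables).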
if __name__ == '__main__':
    # Create (or reuse) a session running against the "local" master.
    spark = (
        SparkSession.builder
        .appName("broadcast_learn")
        .master("local")
        .getOrCreate()
    )

    # Sample records of the form (name, gender, number); note that the
    # last record is a list while the others are tuples.
    data = [('Alex', 'male', 3), ('Nancy', 'female', 6), ['Jack', 'male', 9]]

    # Lookup table keyed by name; this is the payload that gets broadcast.
    d_map = {'Alex': 1, 'Nancy': 2, 'Jack': 3}

    sc = spark.sparkContext
    bc = sc.broadcast(d_map)
    rdd = sc.parallelize(data)

    # The broadcast payload can also be read on the driver through .value.
    print(bc.value)

    def age_func(arr):
        """Return a list copy of ``arr`` with its third item replaced.

        The replacement is the value stored in the broadcast dict under
        the record's first item, or ``None`` when that key is absent.
        ``arr`` must contain at least three items; it is not modified.
        """
        record = [*arr]
        # Read the dict through the broadcast handle inside the function.
        lookup = bc.value
        record[2] = lookup.get(record[0])
        return record


    try:
        # foreach is an action: it triggers the job and calls print on
        # each mapped record where the task runs (with the "local" master
        # the output shows up in this console).
        rdd.map(age_func).foreach(lambda arr: print(arr))
    finally:
        # Always release the session, even if the job above raises.
        spark.stop()
